#!/usr/bin/perl
use strict;
use warnings;
use List::Util qw(min max);
use Scalar::Util qw(looks_like_number);
use Cwd 'abs_path';
use Getopt::Long;
use lib './scripts';
use ConservatoryUtils;
use CNSDatabase;
use MappingDatabase;
use ConservatoryTree;

$|=1;

my $inCNSFileName;
my $inMapFileName;
my $outCNSFileName;
my $outRejectFileName;
my $outMapFileName;
my $help =0;

my $maxCNSLength=250;
my $startCNSNameIndex=1;
my $filterDeepCNS=0;
my $filterRepetitive=0;
my $filterPlastid=0;
my $filterLongCNS=0;
my $fixPositions=0;
my $fixGenome;
my $assignBySynteny=0;
my $renameCNS=0;
my $gffOutputFileName;
my $makegffReference;
my $sortMapFile;

my $filterRepetitiveCNSCutoff = 0.005; ### Filter the top 0.005%
my $regionSearchFlank=75;

my $verbose=0;

my $conservatoryDir=abs_path(".");
my %CNSTable;

GetOptions ("conservatoryDirectory=s" => \$conservatoryDir,
			"in-cns=s" => \$inCNSFileName,
			"in-map=s" => \$inMapFileName,
			"out-cns=s" => \$outCNSFileName,
			"out-map=s" => \$outMapFileName,
            "out-reject=s" => \$outRejectFileName,
            "filter-deep-cns" => \$filterDeepCNS,
            "filter-repetitive" => \$filterRepetitive,
            "filter-plastid" => \$filterPlastid,
            "filter-long" => \$filterLongCNS,
            "fix-positions" => \$fixPositions,
            "fix-genome=s" => \$fixGenome,
            "rename-cns" => \$renameCNS,
            "name-index-start=n" => \$startCNSNameIndex,
            "assign-by-synteny" => \$assignBySynteny,
            "make-gff=s" => \$gffOutputFileName,
            "gff-reference=s" => \$makegffReference,
            "sort" => \$sortMapFile,
			"verbose" => \$verbose,
			"help" => \$help) or die ("Error in command line arguments\n");

if($help || !defined $inCNSFileName || !defined $inMapFileName || $inCNSFileName eq "" || $inMapFileName eq "") {
	print "Conservatory version 2.0.1\n\n";

	print "CNSUtils --in-cns <cnsFile> --in-map <cnsPositionMapFile> [--out-cns <cnsFile>] [--out-map <cnsPositionMapFile>].\n";
    print "\t\t[--filter-deep-cns] [--filter-repetitive] [--filter-plastid] [--filter-long] [--fix-positions] [--make-gff <output gff filename> --gff-reference <reference genome>] [--verbose]\n\n";
    print "\n\nFilter the merged CNS file based on the following rules:\n";
    print "   - Remove Deep CNS (deeper than node N14) that are only called by one reference\n";
    print "   - Remove highly repetitive sequences\n";
    print "   - Remove plastid CNS\n\n";
    print "   - Fix positions Verify (and correct if needed) the positions of the CNS in all genomes\n";
    print "   - Make GFF - create a GFF file\n\n\n";
	
	exit();
}

if($verbose) { print "Starting filtering. $inCNSFileName, $inMapFileName. Filters: $filterDeepCNS $filterRepetitive $filterPlastid $filterLongCNS $fixPositions.\n"; }

my $genomeDB = new GenomeDatabase($conservatoryDir);
my $conservatoryTree = new ConservatoryTree($genomeDB);

### Read CNS file
my $CNSDB = new CNSDatabase($inCNSFileName, $verbose);

if(defined $gffOutputFileName) {
    if(!defined $makegffReference) { die "ERROR: No reference provided for GFF.\n";}
    my $mappingDB = new MappingDatabase("grep $makegffReference $inMapFileName |", 1, $makegffReference);

    open(my $gffOutputFile, ">$gffOutputFileName");
    print $gffOutputFile "##gff-version 3\n";
    
    foreach my $curCNS ( @{ $CNSDB->getCNSByOrder() }) {
        my $locusMerge="";
        $mappingDB->mergeOverlappingMappingsForCNS($curCNS, 1); ### Merge overlapping mappings for the CNS, ignoring the associated locus (to get one combined sequence)
        my @mappings = @{ $mappingDB->getMappingsForCNS($curCNS) };

        if(scalar @mappings >0) {
            $locusMerge = substr($locusMerge, 0, length($locusMerge)-1);
            my $node = $curCNS->getConservationLevel();
            my $conservationLevel = nodeToLevel($node);

            @mappings = reverse sort { $a->getAbsChr() cmp $b->getAbsChr() ||
                                       $a->getAbsPos() cmp $b->getAbsPos() } @mappings;

            my $lastMapping;

            foreach my $curMapping (@mappings) {
                if(geneToGenome($curMapping->getLocus()) ne $makegffReference || (defined $lastMapping && $lastMapping->overlap($curMapping)==1)) { next; }

                my @associatedLoci = split /\|/, $curMapping->getLocus();
                my @associatedLociShortName;

                foreach my $curLoci (@associatedLoci) {
                    push @associatedLociShortName, geneToLocus($curLoci);
                }


                print $gffOutputFile join("\t", $curMapping->getAbsChr(),
                                          "Conservatory2",
                                          "CNS",
                                          $curMapping->getAbsPos(),
                                          $curMapping->getAbsPos() + $curMapping->getAbsLen() -1,
                                          ".",
                                          $curMapping->getStrandInGenome(),
                                          ".",
                                          "associated_loci=" . join("|", sort @associatedLociShortName) .
                                          ";Name=" . $curCNS->getID() . ";species=" . $curCNS->getSupportingSpeciesNumber() . 
                                          ";node=" . $node . 
                                          ";level=" . $conservationLevel . 
                                          ";original_CNS=" . $curCNS->getLocus() .
                                          ";ancesteral=" . $curCNS->getSeq() .  "\n");

                $lastMapping= $curMapping;
            }
        }
    }
    close($gffOutputFile);
    ## Now sort the GFF
    system("{ grep '^##' $gffOutputFileName; grep -v '^##' $gffOutputFileName | sort -k1,1 -k4,4n -k5,5n; } > $gffOutputFileName.tmp && mv $gffOutputFileName.tmp $gffOutputFileName");
    exit(0);
}


my $mappingDB = new MappingDatabase($inMapFileName, 0);
if($assignBySynteny) {
    $mappingDB->renameMappings();
    $mappingDB->orderMappingByPosition();
    my $lastMapping;
    foreach my $curMapping (@{ $mappingDB->getMappingsByOrder()}) {
        if(!$CNSDB->exists($curMapping->getCNSID())) {
            $mappingDB->deleteMapping($curMapping);
            next;
        }
        if(defined $lastMapping && $lastMapping->overlap($curMapping)) {
            ## If these are two genes for the same CNS, see which one has better synteny
            if($curMapping->getLocus() eq $lastMapping->getLocus() && $curMapping->getCNSID() eq $lastMapping->getCNSID()) {
                $curMapping->merge($lastMapping);
                $mappingDB->deleteMapping($lastMapping);
                $lastMapping=$curMapping;
            } elsif ($curMapping->getLocus() eq $lastMapping->getLocus() && $curMapping->getCNSID() ne $lastMapping->getCNSID()) {
                my $lastCNS = $CNSDB->getCNSByID($lastMapping->getCNSID());
                my $curCNS = $CNSDB->getCNSByID($curMapping->getCNSID());
                my $syntenyComparison = $lastCNS->compareSynteny($curCNS, $mappingDB, $conservatoryTree );
                if($syntenyComparison ==1) { ##if it is more syntentic with the last cns, remove the mapping
                    $mappingDB->deleteMapping($curMapping);
                } elsif($syntenyComparison ==2) {
                    $mappingDB->deleteMapping($lastMapping);
                    $lastMapping = $curMapping;
                } 
            } elsif($curMapping->getLocus() ne $lastMapping->getLocus() && $curMapping->getCNSID() eq $lastMapping->getCNSID()) { ## if it is the same CNS but different gene
                ### assign by default to the closest gene.
                ### alternative is looking at broader synteny?
                if(abs($curMapping->getPos()) < abs($lastMapping->getPos())) {
                    $mappingDB->deleteMapping($lastMapping);
                    $lastMapping=$curMapping;
                } else {
                    $mappingDB->deleteMapping($curMapping);
                }
            }
        } else {
            $lastMapping = $curMapping;
        }
    }
}

if($fixPositions) {
    my $totalVerifiedPositions=0;
    my $totalFixedPositions=0;
    my $totalRejectedPositions=0;

    my %genomeToFamily;

    if($verbose) { print "PROGRESS: Begin verifying positions\n"; }

    my $curGenome="";
    my $curGenomeSequence;

    my $inSortedMapFile;
    if($sortMapFile) {
        $mappingDB->orderMappingByGenome();
    }
    my $outMapRejectFile;
    if(defined $outRejectFileName) {
        open ($outMapRejectFile ,">" , $outRejectFileName);
    }

    foreach my $curMapping (@{ $mappingDB->getMappingsByOrder() }) {
        
       if( (defined $fixGenome && ($curMapping->getGenome() ne $fixGenome)) ||  !$curMapping->hasAbsCoordiantes() ) { next; }

       if(geneToGenome($curMapping->getLocus()) ne $curGenome) {
            clearGeneCooordinateDatabase();
            $curGenome = geneToGenome($curMapping->getLocus());
            $curGenomeSequence = $genomeDB->readGenome($curGenome);
        }

        if($verbose) { print "PROGRESS: Processing mappings..." . $totalVerifiedPositions++ . " ($curGenome)....\r" };

        if($curMapping->getAbsEnd() > length($curGenomeSequence->{ cleanChrName($curMapping->getAbsChr()) })+1) {
            if(defined $outRejectFileName) {
                $curMapping->print($outMapRejectFile);
            }
            $mappingDB->deleteMapping($curMapping);
            next;
        }

        my $newAbsStart = remap($curMapping, $curGenomeSequence);

        if($newAbsStart == -1)  { ## If we failed, try the reverse complement
            $curMapping->flipKeepStrand();
            $newAbsStart = remap($curMapping, $curGenomeSequence);
        }

        if($newAbsStart != -1) {
            ## Get the absolute coordiantes for the gene to update the relative position
            my %geneCoordinates = getGeneCoordinates($genomeDB->getConservatoryDir(), $genomeDB->genomeToFamily( geneToGenome($curMapping->getLocus() ) ),  geneToGenome($curMapping->getLocus()), $curMapping->getLocus());
            if($curMapping->getAbsPos() != $newAbsStart) {
                $totalFixedPositions++;
                $curMapping->setAbsPos($newAbsStart);
                $curMapping->setRelPosFromAbs($newAbsStart, \%geneCoordinates);            
            }
        } else {  ### Else, reject
            $curMapping->flipKeepStrand(); ## Flip back.
            $curMapping->print($outMapRejectFile);
            $mappingDB->deleteMapping($curMapping);
            $totalRejectedPositions++
        }
    }
    if(defined $outRejectFileName) {
        close($outMapRejectFile);
    }

    if($verbose) { print "PROGRESS: Done verifying positions. Verified $totalVerifiedPositions. Fixed $totalFixedPositions. Rejected $totalRejectedPositions.\n"; } 
}

### Now Begin filtering
if($filterPlastid) {
    my $plastidFiltered=0;

    foreach my $curMapping ( @{ $mappingDB->getMappingsByOrder() }) {
        if ($curMapping->isPlastid() && $CNSDB->exists($curMapping->getCNSID())) {
            $CNSDB->getCNSByID( $curMapping->getCNSID() )->setComment("PlastidFilter");
            $plastidFiltered++;
        }
    }
    if($verbose) { print "PROGRESS: Plastid Filter: Removed $plastidFiltered CNS\n";}
}
my $longFiltered=0;

if($filterLongCNS) {
    foreach my $curCNS (  @{ $CNSDB->getCNSByOrder()} ) {
        if($curCNS->getLen() > $maxCNSLength) {
           $curCNS->setComment("TooLong");
           $longFiltered++;
        }
    }

    if($verbose) { print "Filtered $longFiltered with length > $maxCNSLength...\n"; }    
}


if($filterRepetitive) {
    my $repetitiveFiltered=0;
    ## First, establish cutoff
    my @posPerCNS;

    foreach my $curCNS ( @{ $CNSDB->getCNSByOrder()}) {
        my $mappingsPerCNS = $mappingDB->getMappingsForCNS( $curCNS->getID());
        if(defined $mappingsPerCNS) {
            push @posPerCNS, scalar @{ $mappingsPerCNS };
        } else {
            $CNSDB->deleteCNS($curCNS);
        }
    }
    @posPerCNS = reverse sort { $a <=> $b} @posPerCNS;
    my $filterValue = $posPerCNS[ scalar @posPerCNS * ($filterRepetitiveCNSCutoff/100) ];

    foreach my $curCNS (  @{ $CNSDB->getCNSByOrder()} ) {
        if(!defined $curCNS->getComment() && scalar @{ $mappingDB->getMappingsForCNS( $curCNS->getID())  } > $filterValue)  {
            $curCNS->setComment("RepetitiveFilter");
            $repetitiveFiltered++;
        }
    }
    if($verbose) { print "Filtering highly repetitive CNS (CNS with more than $filterValue positions). Removed $repetitiveFiltered CNS\n"; }
}

if($filterDeepCNS) {
    if($verbose) { print "PROGRESS: Start Deep Filter..";}    
    my $deepCNSFiltered=0;
    my $deepCNSTrimmed=0;
    my @angiosperms = $genomeDB->getSpeciesForLevel(2, "Angiosperms");
    my @nonAngiosperms = grep { my $element = $_; not grep {$_ eq $element } @angiosperms } $genomeDB->getSpecies();

    foreach my $curCNS ( @{ $CNSDB->getCNSByOrder()}) {

        my $nodeLevel = $curCNS->getConservationLevel();
        if($nodeLevel eq "") {
            my @speciesForCNS =  @{ $curCNS->getSupportingSpecies($mappingDB) };            
            $curCNS->setConservationLevel($conservatoryTree->findDeepestCommonNode(\@speciesForCNS));        
            $nodeLevel = $curCNS->getConservationLevel();
        }
        $nodeLevel =~ s/N//;

        # for the deep CNS. Apply multiple filters. First, it must be found in more than one reference
        if(! looks_like_number($nodeLevel) ) {
            print "\n Node level ($nodeLevel) is not a number. Conservation: " . $curCNS->getConservationLevel() . "\n";
            $curCNS->print;
        }
        if(!defined $curCNS->getComment() && ($nodeLevel + 0 <=14)) {
            if(($curCNS->getNumOfRefGenomes() <2) ) {
                $curCNS->setComment("DeepFilter");
                $deepCNSFiltered++;
            } elsif(($nodeLevel +0 <=8) && ($curCNS->getNumOfRefGenomes() <3)) {  ### For the deepest ones, make sure we have more than 2 ref genomes
                $curCNS->setComment("DeepFilter");
                $deepCNSFiltered++;
            } else {  ### Make sure these are not sproious mappings that are only supported by one species
                if($verbose) { print "."; }
                my @speciesForCNS =  @{ $curCNS->getSupportingSpecies($mappingDB) };
                my @speciesToTest = sort grep { my $element=$_; grep {$_ eq $element} @speciesForCNS } @nonAngiosperms;
                my @childNodes;
                my @speciesToTrim;
                my $trimmed;
                do {
                    $trimmed=0;
                    @speciesToTest = grep { my $element=$_; not grep {$_ eq $element} @speciesToTrim } @speciesToTest;
                    my @trimmedSpeciesForCNS = grep { my $element=$_; not grep {$_ eq $element} @speciesToTrim } @speciesForCNS;
                    $curCNS->setConservationLevel($conservatoryTree->findDeepestCommonNode(\@trimmedSpeciesForCNS));

                    @childNodes = $conservatoryTree->getChildren($curCNS->getConservationLevel());

                    foreach my $curSpeciesToOmit (@speciesToTest) {
                        my @speciesLessOne = grep { $_ ne $curSpeciesToOmit } @speciesForCNS;
                        my $newNode = $conservatoryTree->findDeepestCommonNode(\@speciesLessOne);

                        ### We removed one species. If that changes the conservation level by more than one junction in the tree, remove the species
                        if($newNode ne $curCNS->getConservationLevel() && !(grep {$_ eq $newNode} @childNodes) ) {
                            push @speciesToTrim, $curSpeciesToOmit;
                            $trimmed = 1;
                        }
                    }
                } while($trimmed);
                ### Now remove the mappings
                foreach my $curMapping (@{ $mappingDB->getMappingsForCNS($curCNS) }) {
                    if(grep {$_ eq $curMapping->getSpecies()} @speciesToTrim ) {
                        $mappingDB->deleteMapping($curMapping);
                        $deepCNSTrimmed++;
                    }
                }
                my @newSpeciesForCNS = @{ $curCNS->getSupportingSpecies($mappingDB) };
                $curCNS->setConservationLevel($conservatoryTree->findDeepestCommonNode(\@newSpeciesForCNS));
            }
        }
    }

    ##### filter by number of species and conservation level (for deep CNSs)
    foreach my $curCNS (@{ $CNSDB->getCNSByOrder()} ) {
        my $conservationLevel = nodeToLevel($curCNS->getConservationLevel());

        my $minSpeciesToFilter=0;
        if($conservationLevel eq "Ancient") {
            $minSpeciesToFilter = $minSpeciesPerConservationLevel[2];
        } elsif($conservationLevel eq "Tracheophytes") {
            $minSpeciesToFilter = $minSpeciesPerConservationLevel[3];
        } elsif($conservationLevel eq "Angiosperms") {
            $minSpeciesToFilter = $minSpeciesPerConservationLevel[4];
        } elsif($conservationLevel eq "Dicots" || $conservationLevel eq "Monocots" ) {
            $minSpeciesToFilter = $minSpeciesPerConservationLevel[5];
        }

        if($curCNS->getSupportingSpeciesNumber($mappingDB) <= $minSpeciesToFilter) {
            $curCNS->setComment("LowSupportingSpeciesFilter");
            $deepCNSFiltered++;
        }
    }

    if($verbose) { print "\nDeep CNS Filter: Removed $deepCNSFiltered CNS, trimmed $deepCNSTrimmed mappings.\n"; }
}

if($renameCNS) {        ### Rename all the CNSs to give consistent name based on their conservation level and a unique ID.
    my %CNSNameIndices;
    foreach my $curCNS ( @{ $CNSDB->getCNSByOrder() }) {
        my $conservationLevel;

        $conservationLevel = nodeToLevel($curCNS->getConservationLevel());

        if(!defined $CNSNameIndices{$conservationLevel}) { $CNSNameIndices{$conservationLevel}= $startCNSNameIndex; }
        my $newCNSName = $conservationLevel . "_" . $CNSNameIndices{$conservationLevel};
        $mappingDB->renameCNSMappings($curCNS, $newCNSName);
        $CNSDB->renameCNS($curCNS, $newCNSName);
        $CNSNameIndices{$conservationLevel}++;
    }
}


## open Reject DB
my $rejectCNSDB = new CNSDatabase();

foreach my $curCNS (  @{ $CNSDB->getCNSByOrder()} ) {
    if(defined $curCNS->getComment()) {
        $rejectCNSDB->add($curCNS);
    }
}
if(defined $outRejectFileName && !$fixPositions && $rejectCNSDB->getNumberOfCNSs()>0) {
    $rejectCNSDB->writeDatabase($outRejectFileName);
}

foreach my $curCNS (  @{ $CNSDB->getCNSByOrder()} ) {
    if(defined $curCNS->getComment()) {
        $CNSDB->deleteCNS($curCNS);
        $mappingDB->deleteCNS($curCNS);
    }
}


if(defined $outCNSFileName) {
    $CNSDB->writeDatabase($outCNSFileName);
}


if(defined $outMapFileName) {
    $mappingDB->writeDatabase($outMapFileName);
}


##################################################################################################################
##################################################################################################################
##################################################################################################################

sub compareSequences {
    my ($seq1, $seq2) = @_;

    $seq1 = uc($seq1);
    $seq2 = uc($seq2);
    
    my $len = length($seq1);
    my $mismatch_count = 0;

    for (my $i = 0; $i < $len; $i++) {
        my $charSeq1 = substr($seq1, $i, 1);
        my $charSeq2 = substr($seq2, $i, 1);
        next if (! ($charSeq1 =~ /ACGT/ && $charSeq2 =~ /ACGT/));

        if ($charSeq1 ne $charSeq2) {
            return 0;
        }
    }
    return 1;
}


sub remap {
    my ($curMapping, $curGenomeSequence) = @_;

    my $searchSequence = $curMapping->getTargetSeqInGenome();
    my $startCoordBias = min(0,$curMapping->getAbsPos() - $regionSearchFlank -1);  ### If this is the start of the choromosome and we donot have enough flank sequeuence
                                                                     ## record where we started
    my $regionSequence = substr($curGenomeSequence->{cleanChrName($curMapping->getAbsChr() )}, max(0,$curMapping->getAbsPos() - $regionSearchFlank - 1), $curMapping->getAbsLen()+$regionSearchFlank*2);

    my @searchPositions = (0);
    foreach my $curFlankPos (1..$regionSearchFlank) {
        push @searchPositions, $curFlankPos;
        push @searchPositions, -$curFlankPos;        
    }
    my $hitPosition = -1;
    my $foundHit=0; 
    foreach my $curSearchPosition (@searchPositions) {
        if(compareSequences(substr($regionSequence, $regionSearchFlank + $searchPositions[$curSearchPosition]+$startCoordBias, length($searchSequence)), $searchSequence) ) { 
            $hitPosition = $curSearchPosition;
            last;
        }
    }

    if($hitPosition != -1) { ### If we found our sequence, update the coordiantes
        my $newAbsStart = $curMapping->getAbsPos() + $searchPositions[$hitPosition];
        return $newAbsStart;
    } else {
        return -1;
    }
}



sub nodeToLevel {
    my ($node) = @_;

    my @ancientNodes = ("N1","N2","N3","N4","N5","N6","N7","N8");
    my @tracheophyteNodes = ("N9","N10","N11");
    my @angiospermsNodes = ("N12","N13","N14");
    my @dicotNodes = ("N15","N16","N17","N18");
    my @rosidNodes = ("N19","N20");
    my @asteridNodes = ("N167","N168", "N170","N182","N192","N193","N169");
    my @monocotNodes = ("N226","N227","N228","N228");

    my @solanaceaeNodes =("N194","N195","N196","N199","N197","N200","N201","N202","N204");
    my @poaceaeNodes=("N233","N234","N235","N236","N237","N238","N239","N240","N248","N249","N252","N255","N253","N251","N250","N243","N245","N246");
    my @fabaceaeNodes=("N90","N91","N92","N111","N93","N100","N94","N97","N95","N98","N101","N102","N103","N104","N106","N105","N109","N107");
    my @brassicaceaeNodes=("N24","N25","N52","N26","N51","N42","N43","N44","N50","N49","N45","N46","N47","N41","N40","N29","N27","N39","N37","N30","N31","N32","N28","N33");
    my @lemnaceaNodes=("N269","N270","N276","N271","N272","N273");
    my @asteraceaNodes =("N172","N173","N178","N180","N179","N174","N176","N175","N177");

    my $conservationLevel = "Order";
    if( grep( /^$node$/, @ancientNodes) ) { $conservationLevel = "Ancient"; }
    if( grep( /^$node$/, @tracheophyteNodes) ) { $conservationLevel = "Tracheophytes"; }
    if( grep( /^$node$/, @angiospermsNodes) ) { $conservationLevel = "Angiosperms"; }
    if( grep( /^$node$/, @dicotNodes) ) { $conservationLevel = "Dicots"; }
    if( grep( /^$node$/, @monocotNodes) ) { $conservationLevel = "Monocots"; }
    if( grep( /^$node$/, @rosidNodes) ) { $conservationLevel = "Rosids"; }
    if( grep( /^$node$/, @asteridNodes) ) { $conservationLevel = "Asterids"; }

   if( grep( /^$node$/, @solanaceaeNodes) ) { $conservationLevel = "Solanaceae"; }
   if( grep( /^$node$/, @poaceaeNodes) ) { $conservationLevel = "Poaceae"; }
   if( grep( /^$node$/, @fabaceaeNodes) ) { $conservationLevel = "Fabaceae"; }
   if( grep( /^$node$/, @brassicaceaeNodes) ) { $conservationLevel = "Brassicaeae"; }
   if( grep( /^$node$/, @lemnaceaNodes) ) { $conservationLevel = "Lemnaceae"; }
   if( grep( /^$node$/, @asteraceaNodes) ) { $conservationLevel = "Asteraceae"; }

    return $conservationLevel;
}
